// this file is ALWAYS called by another do-file

* * *   * * *   * * *   * * *   * * *   * * *
*   LOAD & PREPARE THE DATA   
* * *   * * *   * * *   * * *   * * *   * * *

cd "$mypathRR/Datasets/ESTV/"

* canton id got lost, have to map it back later
use "allcant_taxpayers_coll_B.dta", clear
keep snp_gdenr year cant
collapse cant, by(year snp_gdenr)
gen gemeinde = snp_gdenr
save "cant_gemeinde_mapping.dta", replace

* load income and revenue data
use "allcant_taxpayers_coll_A.dta", clear
rename snp_gdenr gemeinde

merge m:1 gemeinde year using "cant_gemeinde_mapping.dta", keepusing(cant)
drop _merge
rm "cant_gemeinde_mapping.dta"

// correct the outliers
	* winsorize observations with a high reink
	winsor2 reink, cuts(0 99) by(year) replace

* label municipalities
sort gemeinde year
drop if gemeinde==.
run "$mypathRR/Resources/labels-municipalities.do"
label values gemeinde GEMEINDE

* generate a string variable with the municipality names
sdecode gemeinde, gen(Gemeinde) 




* AGGREGATE TO CANTON LEVEL
// gen sums
foreach var in reink stbetr {
	gen `var'_sum = `var' * n_taxpayers
}

// collapse at canton level
collapse (sum) reink_sum stbetr_sum n_taxpayers ///
, by(year cant /*snp_stacd*/ snpbercd snpzivcd)

sort cant year

* generate an indicator for each cell
egen cell = group(cant snpbercd snpzivcd)



* Generate and label the dependent variables
label var n_taxpayers "Total number of taxpayers"
	
foreach var in reink  {
	gen `var'=`var'_sum / n_taxpayers / 10
	label var `var' "Net income p.c. (in 1000 CHF)"
}
  
foreach var in stbetr {
	gen `var'=`var'_sum / n_taxpayers 
	label var `var' "Tax revenue p.c. (in CHF)"
} 

replace reink_sum = reink_sum/10000
  label var reink_sum "Net income sum (in mio CHF)"

replace stbetr_sum = stbetr_sum/1000
label var stbetr_sum "Total tax revenue (in 1000 CHF)"

foreach var in reink  stbetr  {
	gen ln_`var' = ln(`var')
}
label var ln_reink "Log of net income p.c. (in 1000 CHF)"
label var ln_stbetr "Log of tax revenue p.c. (in CHF)"
  

  
* Generate and label the controls *
label var year "Year"

label var n_taxpayers "\# taxpayers in cell"

// rename some vars
rename snpbercd beruf
rename snpzivcd zivcd

label var beruf "Labor market status"
label define BERUF 0  `"employee"', modify
label define BERUF 1  `"self-employed"', modify
label define BERUF 2  `"retiree"', modify
label define BERUF 10 `"non-working"', modify
label values beruf BERUF

gen nowork=(beruf==10)
	label var nowork "Non-working"
gen retiree=(beruf==2)
	label var retiree "Retiree"
gen self=(beruf==1)
	label var self "Self-employed"

label var zivcd "Civil status"
label define CIVIL 0 `"married"', modify
label define CIVIL 1 `"single parents"', modify
label define CIVIL 2 `"single"', modify
label define CIVIL 3 `"married with ch."', modify
label values zivcd CIVIL

gen married=(zivcd==0)
	label var married "Married"
gen singlep=(zivcd==1)
	label var singlep "Single parents"
gen marriedch=(zivcd==3)
	label var marriedch "Married with children"


  

* Gen canton treatment dummies
gen treated = (cant == 6)
  label var treated "Treated"
  
gen period = (year > 2005 & year!=.)
  label var period "Period $ t>2005$"

* Gen DiD Interaction term
gen Interaction = treated*period
  label var Interaction "DiD"


* Gen interaction terms for pre-treatment periods
forval n=1/35 {
local i= 2006 -`n'
gen pre`n'=0
replace pre`n'= 1 if treated == 1 & year == `i'
label var pre`n' "`i' (treatment lag `n')"
}

replace pre1 = 0 // make 2005 the reference year


* drop dummies for years that do not exist in tax data and / or no tax data exists for OW
drop pre34 pre32 pre30 pre28 pre26 pre24 pre22 pre21 pre20 pre18 pre16 pre14 pre12 pre10 pre8 pre7 pre6 
// it is important for ES estimation to have a balanced sample!


* Gen interaction terms for post-treatment periods
forval n=1/11 {
local i = 2005 + `n'
gen post`n' = 0
replace post`n' = 1 if treated == 1 & year == `i'
label var post`n' "`i' (treatment lead `n')"
}

* Gen canton-specific time trends
	tab cant, gen (cant_d)
	foreach var of varlist cant_d* {
	gen trend_`var' = year*`var'
	label var trend_`var' "Canton specific time trend"
	}
	gen trend = year





* * * * define the correct weight variable for each outcome * * * * 

* get number of taxpayers in pre-treatment period
	frame put n_taxpayers cant year cell, into(weights5)
	frame change weights5

		keep if year == 2005
		drop year
		foreach var in n_taxpayers {
			rename `var' `var'_2005
		}
		duplicates tag cell, gen(dups_w5)
		tab dups_w5
		drop if cant == . 
		drop dups_w5
		
	frame change  prepdta1
	
* link back to main frame
	frlink m:1 cell, frame(weights5) gen(link_weights5)
	frget n_taxpayers_2005, from(link_weights5)
	

* replace with actual taxpayers in pre-treatment period
	foreach var in n_taxpayers {
	replace `var'_2005 = `var' if year < 2005
	}
	

* gen outcome-specific weights

foreach outcome in reink ln_reink stbetr ln_stbetr {
	gen weight_`outcome' = n_taxpayers_2005
}

* * *   * * *   * * *   * * *   * * *   * * *
*   TIME TREND ESTIMATION
* * *   * * *   * * *   * * *   * * *   * * *
global outcomes "reink"

* Estimate a time trend for each  outcome	 
cd "$mypathRR/Datasets/ESTV/"
	
* estimate & predict canton trend
	preserve
		foreach outcome in $outcomes  {
			reg `outcome' trend_* cant_d* if year < 2006 & year > 1994 [aweight = weight_`outcome']
			predict tr`outcome', xb
			label var tr`outcome' "cantonal time trend in `outcome'"
		}
		
		keep  cant year tr*
		drop trend* treated
		bys cant year: keep if _n==1
	save "est_canton_trends-canton_level-with_controls.dta", replace
	restore 	 
	drop cant_d*

* match trend back to original data
	merge m:1 year cant using "est_canton_trends-canton_level-with_controls.dta", nogen keep(1 3)
	rm "est_canton_trends-canton_level-with_controls.dta"
	

	
* generate residualized outcomes
foreach outcome in $outcomes {
	reg `outcome' tr`outcome' [aweight = weight_`outcome']
	predict resid`outcome', residual
	label var resid`outcome' "residualized `outcome' (after taking out canton trend)"
}
